This file belongs to the repository: https://github.com/drisso/awst_analysis.

The code is released under the GPL v3.0 license.

Data source: Tian et al., "Benchmarking single cell RNA-sequencing analysis pipelines using mixture control experiments" (Nature Methods, 2019)

Install and load awst

# Install BiocManager from CRAN when it is not already available, then use it
# to install the awst package from the drisso/awst GitHub repository.
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("drisso/awst")
# Additional installs, kept commented out. get_RCA() below loads the RCA
# package, so the first one is needed before that function can run.
#BiocManager::install("GIS-SP-Group/RCA")
#BiocManager::install("hgu133plus2.db")

Setup

# Start from an empty global environment so that no stale object leaks into
# the analysis.
rm(list = ls())
library(steFunctions)
library(dendextend)
library(clues)
# Hard-coded working directory: every relative path used below (the *.RData
# files) is resolved against it.
setwd("~/Dropbox/AWST/mixology/")
jobName <- "mixoloy20200925"
# Load the shared helper functions of the awst_analysis repository.
# NOTE(review): calinski(), called further down, is presumably defined in this
# file -- confirm.
source(url("https://raw.githubusercontent.com/drisso/awst_analysis/master/functions.R"))
#####
# Plot-export settings; none of them is read in the code visible in this file.
save_plots <- FALSE
png_width_large <- 2100
png_height_large <- 500
png_width_small <- width_png <- 700
png_height_small <- 700
# NOTE(review): 1/300 is an unusual value for a PNG resolution -- confirm how
# it is consumed by the plotting code.
png_res <- 1/300
####
# Pre-allocated table (170 rows x 8 columns, all NA) for the evaluation
# metrics, and a counter initialised to 0.
# NOTE(review): k is presumably the index of the last filled row of `results`
# -- confirm against the code that fills the table.
results <- matrix(NA, ncol = 8, nrow = 170)
colnames(results) <- c("where", "what", "eca", "ecp", "ari", "G", "noOfClust_Th", "noOfClust_Est") 
k <- 0
# Cluster the cells with RaceID and attach the result to the cell annotation.
#
# Arguments:
#   eData          expression matrix with genes in rows and cells in columns.
#   is_normalized  when FALSE (default) eData is replaced by log2(eData + 1)
#                  before being handed to RaceID; when TRUE it is used as is.
#
# Value: a copy of the global data frame `annotation.df` with the extra
# columns `clustering` (k-medoids partition), `umap.1`/`umap.2` and
# `tsne.1`/`tsne.2` (column-scaled embeddings).
#
# Reads the global `annotation.df`; the global object itself is not modified.
get_RaceID <- function(eData, is_normalized = FALSE) {
  require(RaceID)

  if (!is_normalized) {
    eData <- log2(as.matrix(eData) + 1)
  }

  race <- SCseq(as.data.frame(eData))

  # Bypass RaceID's own filtering/normalisation: the input is used directly
  # as the normalised data and every gene is kept as a clustering feature.
  race@ndata <- race@expdata
  race@genes <- rownames(race@ndata)
  race@counts <- rep(1, ncol(race@ndata))
  names(race@counts) <- colnames(race@ndata)
  race@cluster$features <- race@genes

  race <- compdist(race, metric = "pearson", FSelect = FALSE, knn = NULL)
  race <- clustexp(
    race,
    sat = TRUE, samp = NULL, cln = NULL, clustnr = 30,
    bootnr = 50, rseed = 17000, FUNcluster = "kmedoids"
  )

  annotation.df$clustering <- as.factor(race@cluster$kpart)

  # UMAP embedding, centred and scaled column-wise.
  race <- compumap(race)
  umap_scaled <- scale(race@umap)
  annotation.df$umap.1 <- umap_scaled[, 1]
  annotation.df$umap.2 <- umap_scaled[, 2]

  # t-SNE embedding, centred and scaled column-wise.
  race <- comptsne(race)
  tsne_scaled <- scale(race@tsne)
  annotation.df$tsne.1 <- tsne_scaled[, 1]
  annotation.df$tsne.2 <- tsne_scaled[, 2]

  annotation.df
}
# Cluster the cells with RCA and attach the result to the cell annotation.
#
# Arguments:
#   eData          expression matrix with genes in rows and cells in columns
#                  (its columns must match the rows of `annotation.df`).
#   is_normalized  when FALSE (default) eData is treated as counts and
#                  log2(eData + 1) is analysed; when TRUE eData is analysed
#                  as is and 2^eData is stored as the counts assay.
#
# Value: a copy of the global data frame `annotation.df` with the extra
# columns `clustering`, `pca.1` and `pca.2`.
#
# Reads the globals `prefix` (selects how gene names are reformatted) and
# `annotation.df`. The gene names are always taken from `eData` itself, so
# the function also works when eData does not share its rows with the global
# count matrix.
get_RCA <- function(eData, is_normalized = FALSE) {

  # Rewrite the row names with an "XXXX_" prefix.
  # NOTE(review): presumably this is the gene-name format the RCA reference
  # panel matches against -- confirm with the RCA documentation.
  if(prefix == "sc_10x_5cl") {
    wwhich <- grep("ENSG00", rownames(eData))
    rownames(eData)[wwhich] <- gsub("ENSG0", "", rownames(eData)[wwhich])
    rownames(eData) <- paste("XXXX",rownames(eData),sep="_")
  } else {
    # Map Ensembl gene ids to gene names through biomaRt and keep only the
    # genes whose name source is "HGNC Symbol".
    library(biomaRt)
    mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
    G_list <- getBM(filters="ensembl_gene_id",
                    attributes=c("external_gene_name","ensembl_gene_id","external_gene_source"),
                    values=rownames(eData), mart=mart)
    G_list <- G_list[G_list$external_gene_source == "HGNC Symbol",]
    G_list$format_name <- paste("XXXX",G_list$external_gene_name,G_list$ensembl_gene_id,sep="_")
    rownames(G_list) <- G_list$ensembl_gene_id

    # Restrict both tables to the genes they share, in the same order.
    both <- intersect(rownames(G_list), rownames(eData))
    eData <- eData[both,]
    G_list <- G_list[both,]

    rownames(eData) <- G_list$format_name
  }

  require(SingleCellExperiment)
  if(!is_normalized) {
    sce <- SingleCellExperiment(
      assays = list(
        counts = eData,
        logcounts = log2(eData + 1)
      ), colData = annotation.df)
  } else {
    sce <- SingleCellExperiment(
      assays = list(
        counts = 2^eData,
        logcounts = eData
      ), colData = annotation.df)
  }

  require(preprocessCore)
  require(flashClust)
  require(RCA)

  # Feed the log-scale data to RCA, keeping every gene (no gene filter) and
  # using the input unchanged as the "transformed" data.
  data_obj <- dataConstruct(as.matrix(logcounts(sce)))
  data_obj$geneFilter <- rep(TRUE, nrow(logcounts(sce)))
  data_obj$fpkm_transformed <- data_obj$fpkm_raw
  data_obj <- featureConstruct(data_obj, method = "GlobalPanel")

  set.seed(20742579)
  data_obj <- cellClust(data_obj, method = "hclust", deepSplit_wgcna = 1,
                        min_group_Size_wgcna = 5)

  annotation.df$clustering <- factor(data_obj$group_labels_color$groupLabel)
  # PCA of the RCA feature matrix; the scores are scaled column-wise and the
  # first two are returned.
  pr <- scale(prcomp(t(scale(data_obj$fpkm_for_clust)))$x)
  annotation.df$pca.1 <- pr[, 1]
  annotation.df$pca.2 <- pr[, 2]

  return(annotation.df)
}
# Cluster the cells with SC3 and attach the result to the cell annotation.
#
# Arguments:
#   eData          expression matrix with genes in rows and cells in columns
#                  (its columns must match the rows of `annotation.df`).
#   is_normalized  when FALSE (default) eData is treated as counts and
#                  log2(eData + 1) is stored as logcounts; when TRUE eData is
#                  stored as logcounts and 2^eData as counts.
#
# Value: a copy of the global data frame `annotation.df` with the extra
# column `clustering` holding the SC3 partition obtained for the number of
# clusters estimated by sc3_estimate_k().
#
# Reads the global `annotation.df`; the global object itself is not modified.
get_SC3 <- function(eData, is_normalized =FALSE) {
  require(SingleCellExperiment)
  require(SC3)

  if(!is_normalized) {
    sce <- SingleCellExperiment(
      assays = list(
        counts = eData,
        logcounts = log2(eData + 1)
      ), colData = annotation.df)
  } else {
    sce <- SingleCellExperiment(
      assays = list(
        counts = 2^eData,
        logcounts = eData
      ), colData = annotation.df)
  }

  # SC3 looks the genes up in the `feature_symbol` row annotation.
  rowData(sce)$feature_symbol <- rownames(sce)

  # Estimate the number of clusters, then run SC3 for exactly that value.
  sce <- sc3_estimate_k(sce)
  k_est <- sce@metadata$sc3$k_estimation
  sce <- sc3(sce, ks = k_est, biology = FALSE, n_cores=2,  k_estimator = FALSE, rand_seed=2333333, gene_filter = FALSE)

  # The partition is stored in the column "sc3_<k>_clusters" of colData;
  # fetch it by name instead of building and evaluating code as text.
  ans <- colData(sce)[[paste0("sc3_", k_est, "_clusters")]]

  annotation.df$clustering <- ans
  return(annotation.df)
}
# Cluster the cells with Seurat and attach the result to the cell annotation.
#
# Arguments:
#   eData          expression matrix with genes in rows and cells in columns.
#   resolution     resolution passed to FindClusters() (default 0.6).
#   is_normalized  when FALSE (default) the data are log-normalised and
#                  scaled by Seurat; when TRUE eData is stored directly as
#                  the scaled data of the RNA assay.
#
# Value: a copy of the global data frame `annotation.df` with the extra
# columns `clustering`, `pca.1`/`pca.2`, `umap.1`/`umap.2` and
# `tsne.1`/`tsne.2` (all embeddings centred and scaled column-wise).
#
# Reads the globals `prefix` (project name) and `annotation.df`.
get_Seurat <- function(eData, resolution = 0.6, is_normalized = FALSE) {

  require(Seurat)

  # No feature selection: every row of eData is used in all later steps.
  all_features <- rownames(eData)

  seu <- CreateSeuratObject(eData, project = prefix)

  if (is_normalized) {
    seu@assays$RNA@scale.data <- eData
  } else {
    seu <- NormalizeData(
      seu,
      normalization.method = "LogNormalize",
      scale.factor = 10000,
      verbose = FALSE
    )
    seu <- ScaleData(seu, features = all_features, verbose = FALSE)
  }

  seu <- RunPCA(seu, features = all_features, verbose = FALSE)

  seu <- FindNeighbors(seu, verbose = FALSE)
  seu <- FindClusters(
    object = seu,
    resolution = resolution,
    algorithm = 1,
    n.start = 100,
    graph.name = NULL,
    n.iter = 10,
    random.seed = 0,
    verbose = FALSE
  )

  annotation.df$clustering <- as.factor(seu@active.ident)

  # First two principal components.
  pca_scaled <- scale(seu@reductions$pca@cell.embeddings)
  annotation.df$pca.1 <- pca_scaled[, 1]
  annotation.df$pca.2 <- pca_scaled[, 2]

  # UMAP coordinates.
  seu <- RunUMAP(seu, features = all_features, verbose = FALSE)
  umap_scaled <- scale(seu@reductions$umap@cell.embeddings)
  annotation.df$umap.1 <- umap_scaled[, 1]
  annotation.df$umap.2 <- umap_scaled[, 2]

  # t-SNE coordinates.
  seu <- RunTSNE(seu, features = all_features)
  tsne_scaled <- scale(seu@reductions$tsne@cell.embeddings)
  annotation.df$tsne.1 <- tsne_scaled[, 1]
  annotation.df$tsne.2 <- tsne_scaled[, 2]

  annotation.df
}
# Cluster the cells with RSEC (clusterExperiment) and attach the result to
# the cell annotation.
#
# Arguments:
#   eData          expression matrix with genes in rows and cells in columns
#                  (its columns must match the rows of `annotation.df`).
#   is_normalized  when FALSE (default) eData is treated as counts and
#                  log2(eData + 1) is stored as logcounts; when TRUE eData is
#                  stored as logcounts and 2^eData as counts.
#
# Value: a copy of the global data frame `annotation.df` with the extra
# column `clustering`; negative labels returned by primaryCluster() are
# replaced by NA.
#
# Reads the global `annotation.df`; the global object itself is not modified.
get_clusterExperiment <- function(eData, is_normalized = FALSE) {
  require(clusterExperiment)

  if(!is_normalized) {
    sce <- SingleCellExperiment(
      assays = list(
        counts = eData,
        logcounts = log2(eData + 1)
      ), colData = annotation.df)
  } else {
    sce <- SingleCellExperiment(
      assays = list(
        counts = 2^eData,
        logcounts = eData
      ), colData = annotation.df)
  }

  # Pre-compute the 5-component PCA that RSEC is told to use below.
  # prcomp_irlba() treats rows as observations, so the genes x cells matrix
  # is transposed; the scores then have one row per cell, as reducedDim<-
  # requires. `n` is an argument of prcomp_irlba(), not of the assay accessor.
  log_expr <- as.matrix(assay(sce, "logcounts"))
  reducedDim(sce, "PCA") <- irlba::prcomp_irlba(t(log_expr), n = 5)$x

  se <- RSEC(sce,
             whichAssay = "logcounts",
             minSizes=5, reduceMethod="PCA", nReducedDims=5, ncores=4, random.seed=176201,
             dendroReduce="PCA", dendroNDims = 5,
             mergeMethod = "adjP", mergeDEMethod = "limma",
             mergeCutoff = 0.1, mergeLogFCcutoff = 1)

  # Negative cluster labels are turned into missing values.
  tmp <- primaryCluster(se)
  tmp[tmp < 0] <- NA

  annotation.df$clustering <- tmp

  return(annotation.df)
}

sc_10x_5cl

# Download the gzip-compressed cell metadata of the sc_10x_5cl data set from
# the sc_mixology repository and parse it as CSV.
con <- gzcon(url("https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.metadata.csv.gz?raw=true"))
annotation.df <- read.csv(textConnection(readLines(con)))
# Derive one plotting colour per cell: the levels of the cell_line factor are
# replaced, in level order, by the five colour names.
annotation.df$cell.col <- factor(annotation.df$cell_line)
levels(annotation.df$cell.col) <- c("gold", "red", "blue", "magenta", "green3")
annotation.df$cell.col <- as.character(annotation.df$cell.col)
# cell_line is overwritten after the colours have been derived from it.
# NOTE(review): this relies on a `mix` column in the metadata -- confirm it
# exists for this data set.
annotation.df$cell_line <- paste0("mix", annotation.df$mix)

# Download and parse the matching count table.
con <- gzcon(url("https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.count.csv.gz?raw=true"))
ddata <- read.csv(textConnection(readLines(con)))

# Keep only the cells present both as columns of the counts and as row names
# of the annotation, in the same order.
both <- intersect(colnames(ddata), rownames(annotation.df))
ddata <- ddata[, both]
annotation.df <- annotation.df[both,]
# `prefix` names the data set; it is used for every output file name and is
# read as a global by get_RCA() and get_Seurat().
prefix <- "sc_10x_5cl"
save(ddata, annotation.df, prefix, file = paste0(prefix, "_counts.RData"))
require(awst)
require(EDASeq)
require(Rtsne)
require(umap)
require(SingleCellExperiment)
####
#load(paste0(prefix, "_counts.RData"))
#load(paste0(prefix, "_expression.RData"))
####
# Number of genes with a non-zero count in each cell, and its five-number
# summary.
no_of_detected_gene_per_sample <- colSums(ddata > 0) 
fivenum(no_of_detected_gene_per_sample)
# From here on `ddata` holds the between-lane normalised values (unrounded),
# no longer the raw counts saved above.
ddata <- EDASeq::betweenLaneNormalization(as.matrix(ddata), which = "full", round = FALSE)
# Sanity checks printed before the AWS-transformation: number of genes (rows)
# and of cells (columns) with a positive total.
tmp <- rowSums(ddata)
sum(tmp > 0)
tmp <- colSums(ddata)
sum(tmp > 0)

# Apply the AWS-transformation, then count the rows/columns whose sum is NA.
# NOTE(review): exprData presumably has cells in rows and genes in columns --
# it is transposed before being passed to the genes x cells wrappers below.
exprData <- awst(ddata, poscount = TRUE, full_quantile = TRUE)
sum(is.na(rowSums(exprData)))
sum(is.na(colSums(exprData)))

save(exprData, prefix, file = paste0(prefix, "_expression.RData"))
#dim(exprData <- gene_filter(exprData))
#write.table(exprData, file =  paste0(prefix, "_expression.tsv"), sep = "\t")

# Distances between the rows of exprData, saved together with its dimensions.
nrow_exprData <- nrow(exprData)
ncol_exprData <- ncol(exprData)
ddist <- dist(exprData)
save(ddist, nrow_exprData, ncol_exprData, prefix, file = paste0(prefix, "_expression_dist.RData"))

# Hierarchical clustering with Ward's linkage.
hhc <- hclust(ddist, method = "ward.D2")

# NOTE(review): calinski() is not defined in this file; presumably it comes
# from the sourced functions.R or from steFunctions -- confirm.
aCalinski <- calinski(hhc)

pprcomp <- prcomp(exprData) # Run PC analysis
# Keep only the first five components to limit the size of the saved object.
pprcomp$x <- pprcomp$x[, 1:5]
pprcomp$rotation <- pprcomp$rotation[, 1:5]

# The seed is reset before each stochastic embedding.
set.seed(2020) 
ans_Rtsne <- Rtsne(exprData, pca = FALSE) # Run TSNE

set.seed(2020) 
ans_umap <- umap(exprData) # Run Umap

# Run every clustering wrapper on the normalised data (`ddata`, "raw") and on
# the AWST data (`t(exprData)`, passed with is_normalized = TRUE).
# Pattern used throughout: the wrapper returns annotation.df plus new columns;
# `wwhich` indexes the columns already present in annotation.df, so
# tmp[, -wwhich] keeps only the columns added by the method.
# The commented numbers are system.time() outputs recorded for the data sets
# named at the end of each line.
require(SingleCellExperiment)

rm(tmp)
tmp <- get_RCA(ddata)
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_RCA_raw  <- tmp[, -wwhich]
#    user   system  elapsed 

rm(tmp)
system.time({tmp <- get_RCA(t(exprData), is_normalized = TRUE)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_RCA_awst  <- tmp[, -wwhich]
#    user   system  elapsed 

# For SC3 and clusterExperiment only the colour and the partition are kept.
system.time({ans_SC3_raw <- get_SC3(as.matrix(ddata))[, c("cell.col", "clustering")]})
#    user   system  elapsed 
# 130.476    9.860 2821.177 sc_10x_5cl
#  12.496    2.868  474.091 sc_10x
#   2.284    0.892   55.425 RNAmix_celseq2

system.time({ans_SC3_awst <- get_SC3(t(exprData), is_normalized = TRUE)[, c("cell.col",  "clustering")]})
#    user   system  elapsed 
# 150.964   13.932 3057.959 sc_10x_5cl
#  12.664    2.572  466.772 sc_10x
#   2.540    0.888   57.347 RNAmix_celseq2

system.time({ans_clusterExp_raw <- get_clusterExperiment(ddata, is_normalized = FALSE)[, c("cell.col",  "clustering")]})

system.time({ans_clusterExp_awst <- get_clusterExperiment(t(exprData), is_normalized = TRUE)[, c("cell.col",  "clustering")]})

# Seurat at the default ("LoR", resolution 0.6) and at a higher ("HiR",
# resolution 1.6) clustering resolution.
rm(tmp)
system.time({tmp <- get_Seurat(ddata, is_normalized = FALSE)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_Seurat_raw_LoR  <- tmp[, -wwhich]
#   user  system elapsed 
# 14.412   0.432  14.902 RNAmix_celseq2
# 36.192   0.832  37.031 sc_10x

rm(tmp)
system.time({tmp <- get_Seurat(t(exprData), is_normalized = TRUE)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_Seurat_awst_LoR  <- tmp[, -wwhich]
#   user  system elapsed 
# 11.464   0.184  11.651 RNAmix_celseq2
# 33.304   1.124  34.431 sc_10x

rm(tmp)
system.time({tmp <- get_Seurat(ddata, is_normalized = FALSE, resolution = 1.6)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_Seurat_raw_HiR  <- tmp[, -wwhich]
#   user  system elapsed 
# 11.560   0.152  11.713 RNAmix_celseq2
# 35.472   0.792  36.280 sc_10x

rm(tmp)
system.time({tmp <- get_Seurat(t(exprData), is_normalized = TRUE, resolution = 1.6)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_Seurat_awst_HiR  <- tmp[, -wwhich]
#   user  system elapsed 
# 11.692   0.140  11.836 RNAmix_celseq2
# 34.184   1.232  35.418 sc_10x

rm(tmp)
system.time({tmp <- get_RaceID(ddata, is_normalized = FALSE)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_RaceID_raw  <- tmp[, -wwhich]
#   user  system elapsed 
#  7.004   0.116   7.131 RNAmix_celseq2
# 41.932   0.532  42.429 sc_10x
  
# The AWST values are exponentiated (2^x) before being given to RaceID, which
# then skips its own log2 step because is_normalized = TRUE.
rm(tmp)
system.time({tmp <- get_RaceID(2^t(exprData), is_normalized = TRUE)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_RaceID_awst  <- tmp[, -wwhich]
#   user  system elapsed 
#  5.784   0.092   5.866 RNAmix_celseq2
# 47.252   0.560  47.759 sc_10x

### VST  
# Compute the DESeq2 variance-stabilising transformation once and cache it in
# "<prefix>_VST.RData"; later runs load the cached matrix instead.
if(!file.exists(paste0(prefix, "_VST.RData"))) {
  # Reloading the counts file replaces the normalised `ddata` (and
  # annotation.df, prefix) with the objects saved right after download.
  load(paste0(prefix, "_counts.RData"))
  require(DESeq2)
  dds <- DESeqDataSetFromMatrix(countData = as.matrix(ddata), 
                              colData = annotation.df, 
                              design = ~1)
# vst() is not used because it failed on these data with the error below.
# vvst <- vst(dds, blind=FALSE)
#Error in vst(dds, blind = FALSE) : 
#  less than 'nsub' rows with mean normalized count > 5, 
#  it is recommended to use varianceStabilizingTransformation directly

  system.time({vvst <- varianceStabilizingTransformation(dds, blind=FALSE)})
#    user   system  elapsed 
#1426.620    2.388 1430.940 sc_10x
# 157.372    0.020  157.395 RNAmix
  
  vstData <- assay(vvst)
  save(vstData, file = paste0(prefix, "_VST.RData"))
} else load(paste0(prefix, "_VST.RData"))
  
  # Seurat and RaceID on the VST data; as above, only the columns added by
  # each method are kept.
  rm(tmp)
  system.time({tmp <- get_Seurat(vstData, is_normalized = TRUE)})
#   user  system elapsed 
#267.628   5.892 327.423 sc_10x_5cl
# 10.652   0.112  10.765 RNAmix_celseq2
# 33.912   0.584  34.494 sc_10x
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_Seurat_VST_LoR  <- tmp[, -wwhich]
  
  rm(tmp)
  system.time({tmp <- get_Seurat(vstData, is_normalized = TRUE, resolution = 1.6)})
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_Seurat_VST_HiR  <- tmp[, -wwhich]
#   user  system elapsed 
#286.520   5.096 321.105 sc_10x_5cl
# 11.080   0.128  11.212 RNAmix_celseq2
# 34.228   0.620  34.858 sc_10x
  
  # The VST values are exponentiated (2^x) before being given to RaceID.
  rm(tmp)
  system.time({tmp <- get_RaceID(2^vstData, is_normalized = TRUE)})
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_RaceID_VST  <- tmp[, -wwhich]
#    user   system  elapsed 
#2743.628    4.196 2777.134 sc_10x_5cl
#   8.992    0.096    9.055 RNAmix_celseq2
#  51.904    0.744   52.610 sc_10x

  
  
# SC3, clusterExperiment and RCA on the VST data. For SC3 and
# clusterExperiment only the colour and the partition are kept.
system.time({ans_SC3_VST <- get_SC3(vstData, is_normalized = TRUE)[, c("cell.col",  "clustering")]})
#    user   system  elapsed 
# 145.884   10.376 2626.021 sc_10x_5cl
#  13.448    2.224  459.802 sc_10x
#   2.460    0.836   53.050 RNAmix_celseq2

# The result is stored as `ans_clusterExp_vst` (lower-case suffix): this is
# the name read by the RSEC post-processing and by the final save() call.
system.time({ans_clusterExp_vst <- get_clusterExperiment(vstData, is_normalized = TRUE)[, c("cell.col",  "clustering")]})
  
# RCA: keep only the columns added by the method.
rm(tmp)
system.time({tmp <- get_RCA(vstData, is_normalized = TRUE)})
wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
ans_RCA_vst  <- tmp[, -wwhich]  
#   user  system elapsed 
#  8.208   0.000  40.342 
  
  
# Compute the scry Pearson null residuals once and cache them in
# "<prefix>_scry.RData"; later runs load the cached matrix instead.
if(!file.exists(paste0(prefix, "_scry.RData"))) {
  # Reloading the counts file replaces the normalised `ddata` (and
  # annotation.df, prefix) with the objects saved right after download.
  load(paste0(prefix, "_counts.RData"))
  
  library(scry)
  scryData_pearson <- nullResiduals(as.matrix(ddata), type = "pearson")
  save(scryData_pearson, file = paste0(prefix, "_scry.RData"))
  
} else load(paste0(prefix, "_scry.RData"))
  
  # Seurat, RaceID and RCA on the residuals; only the columns added by each
  # method are kept.
  rm(tmp)
  system.time({tmp <- get_Seurat(scryData_pearson, is_normalized = TRUE)})
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_Seurat_scry_LoR  <- tmp[, -wwhich]
#   user  system elapsed 
#284.504   8.032 356.628 sc_10x_5cl
# 16.140   0.172  16.313 RNAmix_celseq2
# 34.216   0.724  34.948 sc_10x
  
  rm(tmp)
  system.time({tmp <- get_Seurat(scryData_pearson, is_normalized = TRUE, resolution = 1.6)})
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_Seurat_scry_HiR  <- tmp[, -wwhich]
#   user  system elapsed 
#293.972   2.968 320.115 sc_10x_5cl 
# 15.428   0.192  15.620 RNAmix_celseq2
# 34.496   0.908  35.406 sc_10x
  
  
  # The residuals are exponentiated (2^x) before being given to RaceID.
  rm(tmp)
  system.time({tmp <- get_RaceID(2^scryData_pearson, is_normalized = TRUE)})
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_RaceID_scry  <- tmp[, -wwhich]
#    user   system  elapsed 
#2387.520    2.580 2407.494 sc_10x_5cl
#   6.892    0.100    6.976 RNAmix_celseq2
#  59.864    0.528   60.362 sc_10x

  rm(tmp)
  system.time({tmp <- get_RCA(scryData_pearson, is_normalized = TRUE)})
  wwhich <- which(colnames(tmp) %in% colnames(annotation.df))
  ans_RCA_scry  <- tmp[, -wwhich]
#    user   system  elapsed 
#2387.520    2.580 2407.494 sc_10x_5cl
#   7.920    0.000  309.245 sc_10x
  
# SC3 and clusterExperiment on the scry Pearson residuals; only the colour
# and the partition are kept.
system.time({ans_SC3_scry <- get_SC3(scryData_pearson, is_normalized = TRUE)[, c("cell.col",  "clustering")]})
#    user   system  elapsed 
# 143.712   11.704 4087.033 sc_10x_5cl
#  14.312    3.232 1097.622 sc_10x
#   2.484    0.848   76.398 RNAmix_celseq2

# The residual matrix is named `scryData_pearson`; no object called
# `scryData` is created anywhere in this script.
system.time({ans_clusterExp_scry <- get_clusterExperiment(scryData_pearson, is_normalized = TRUE)[, c("cell.col",  "clustering")]})

# When a file of RSEC results "<prefix>_rsec_working.RData" exists, load it
# and rebuild the four clusterExperiment result tables from its contents.
# NOTE(review): the file presumably holds objects named ans_clusterExp_raw,
# ans_clusterExp_awst, ans_clusterExp_scry and ans_clusterExp_vst whose
# "rownames" and "listData" attributes are read below (this looks like the
# layout of an S4 DataFrame) -- confirm against the script that writes it.
if(file.exists(paste0(prefix, "_rsec_working.RData"))) { 
  load(paste0(prefix, "_rsec_working.RData"))
  
  # Same four steps for every data version: select the annotated cells by
  # name, copy the stored partition, turn negative labels into NA, and keep
  # colour plus partition (as a factor).
  rm(tmp)
  tmp <- annotation.df[attr(ans_clusterExp_raw, "rownames"),]
  tmp$clustering <- attr(ans_clusterExp_raw, "listData")$clustering_res
  tmp$clustering[tmp$clustering < 0] <- NA
  tmp$clustering <- factor(tmp$clustering)
  ans_clusterExp_raw <- tmp[, c("cell.col", "clustering")]

  rm(tmp)
  tmp <- annotation.df[attr(ans_clusterExp_awst, "rownames"),]
  tmp$clustering <- attr(ans_clusterExp_awst, "listData")$clustering_res
  tmp$clustering[tmp$clustering < 0] <- NA
  tmp$clustering <- factor(tmp$clustering)
  ans_clusterExp_awst <- tmp[, c("cell.col", "clustering")]

  rm(tmp)
  tmp <- annotation.df[attr(ans_clusterExp_scry, "rownames"),]
  tmp$clustering <- attr(ans_clusterExp_scry, "listData")$clustering_res
  tmp$clustering[tmp$clustering < 0] <- NA
  tmp$clustering <- factor(tmp$clustering)
  ans_clusterExp_scry <- tmp[, c("cell.col", "clustering")]

  tmp <- annotation.df[attr(ans_clusterExp_vst, "rownames"),]
  tmp$clustering <- attr(ans_clusterExp_vst, "listData")$clustering_res
  tmp$clustering[tmp$clustering < 0] <- NA
  tmp$clustering <- factor(tmp$clustering)
  ans_clusterExp_vst <- tmp[, c("cell.col", "clustering")]
}
  
  
  
  
# Store the hierarchical clustering, the embeddings and the result of every
# method / data-version combination in "<prefix>_expression_working.RData".
# save() fails if any of the listed objects does not exist.
save(hhc, nrow_exprData, ncol_exprData, annotation.df, aCalinski,
     pprcomp, ans_Rtsne, ans_umap, prefix, 
     ans_RaceID_awst, ans_RaceID_raw,
     ans_RaceID_scry, 
     ans_RaceID_VST,
     ans_RCA_raw, ans_RCA_awst, 
     ans_RCA_vst, ans_RCA_scry,
     ans_SC3_raw, ans_SC3_awst, 
     ans_SC3_VST, ans_SC3_scry, 
     ans_Seurat_raw_LoR, ans_Seurat_raw_HiR,
     ans_Seurat_awst_LoR, ans_Seurat_awst_HiR,
     ans_Seurat_VST_LoR, ans_Seurat_VST_HiR,
     ans_Seurat_scry_LoR, ans_Seurat_scry_HiR,
     ans_clusterExp_raw, ans_clusterExp_awst,
     ans_clusterExp_scry, ans_clusterExp_vst, 
     file = paste0(prefix, "_expression_working.RData"))

Dendrogram of sc_10x_5cl

## cluster accuracy (eca): 0.9861
## cluster purity (ecp): 0.9806
## adjusted Rand's index (ari): 0.9632
## G index (geometric average of eca, ecp, and ari): 0.9766
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 5

PCA of sc_10x_5cl

t-SNE of sc_10x_5cl

umap of sc_10x_5cl

counts + RaceID (default + umap) of sc_10x_5cl

## cluster accuracy (eca): 0.9767
## cluster purity (ecp): 0.8551
## adjusted Rand's index (ari): 0.5137
## G index (geometric average of eca, ecp, and ari): 0.7542
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 12

AWST + RaceID (default + umap) of sc_10x_5cl

## cluster accuracy (eca): 0.9853
## cluster purity (ecp): 0.8087
## adjusted Rand's index (ari): 0.4089
## G index (geometric average of eca, ecp, and ari): 0.6881
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 17

VST + RaceID (default + umap) of sc_10x_5cl

## cluster accuracy (eca): 0.961
## cluster purity (ecp): 0.9081
## adjusted Rand's index (ari): 0.6685
## G index (geometric average of eca, ecp, and ari): 0.8356
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 8

Townes + RaceID (default + umap) of sc_10x_5cl

## cluster accuracy (eca): 0.8172
## cluster purity (ecp): 0.9366
## adjusted Rand's index (ari): 0.1493
## G index (geometric average of eca, ecp, and ari): 0.4852
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 2

counts + RCA (pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9161
## cluster purity (ecp): 0.8042
## adjusted Rand's index (ari): 0.4973
## G index (geometric average of eca, ecp, and ari): 0.7155
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 19

AWST + RCA (pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9044
## cluster purity (ecp): 0.7996
## adjusted Rand's index (ari): 0.4613
## G index (geometric average of eca, ecp, and ari): 0.6936
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 20

VST + RCA (pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9161
## cluster purity (ecp): 0.8042
## adjusted Rand's index (ari): 0.4973
## G index (geometric average of eca, ecp, and ari): 0.7155
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 19

Townes + RCA (pca) of sc_10x_5cl

## cluster accuracy (eca): 0.928
## cluster purity (ecp): 0.5664
## adjusted Rand's index (ari): 0.1112
## G index (geometric average of eca, ecp, and ari): 0.3881
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 109

counts + Seurat (low resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9862
## cluster purity (ecp): 0.8837
## adjusted Rand's index (ari): 0.6
## G index (geometric average of eca, ecp, and ari): 0.8056
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 12

AWST + Seurat (low resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9845
## cluster purity (ecp): 0.8366
## adjusted Rand's index (ari): 0.4787
## G index (geometric average of eca, ecp, and ari): 0.7333
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 15

VST + Seurat (low resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9869
## cluster purity (ecp): 0.8854
## adjusted Rand's index (ari): 0.5757
## G index (geometric average of eca, ecp, and ari): 0.7953
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 12

Townes + Seurat (low resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9851
## cluster purity (ecp): 0.8765
## adjusted Rand's index (ari): 0.6074
## G index (geometric average of eca, ecp, and ari): 0.8064
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 12

counts + Seurat (high resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9837
## cluster purity (ecp): 0.7908
## adjusted Rand's index (ari): 0.3455
## G index (geometric average of eca, ecp, and ari): 0.6453
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 19

AWST + Seurat (high resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9848
## cluster purity (ecp): 0.7871
## adjusted Rand's index (ari): 0.3394
## G index (geometric average of eca, ecp, and ari): 0.6408
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 20

VST + Seurat (high resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9846
## cluster purity (ecp): 0.7921
## adjusted Rand's index (ari): 0.3488
## G index (geometric average of eca, ecp, and ari): 0.648
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 19

Townes + Seurat (high resolution, pca) of sc_10x_5cl

## cluster accuracy (eca): 0.9833
## cluster purity (ecp): 0.8208
## adjusted Rand's index (ari): 0.4275
## G index (geometric average of eca, ecp, and ari): 0.7014
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 16

counts + SC3 of sc_10x_5cl

## cluster accuracy (eca): 0.9811
## cluster purity (ecp): 0.8702
## adjusted Rand's index (ari): 0.662
## G index (geometric average of eca, ecp, and ari): 0.8268
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 16

AWST + SC3 of sc_10x_5cl

## cluster accuracy (eca): 0.9861
## cluster purity (ecp): 0.8265
## adjusted Rand's index (ari): 0.4292
## G index (geometric average of eca, ecp, and ari): 0.7046
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 20

VST + SC3 of sc_10x_5cl

## cluster accuracy (eca): 0.9843
## cluster purity (ecp): 0.8792
## adjusted Rand's index (ari): 0.6743
## G index (geometric average of eca, ecp, and ari): 0.8356
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 15

Townes + SC3 of sc_10x_5cl

## cluster accuracy (eca): 0.9685
## cluster purity (ecp): 0.627
## adjusted Rand's index (ari): 0.1299
## G index (geometric average of eca, ecp, and ari): 0.4289
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 76

counts + clustExp of sc_10x_5cl

## cluster accuracy (eca): 0.9809
## cluster purity (ecp): 0.7198
## adjusted Rand's index (ari): 0.268
## G index (geometric average of eca, ecp, and ari): 0.5741
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 33

AWST + clustExp of sc_10x_5cl

## cluster accuracy (eca): 0.9881
## cluster purity (ecp): 0.8742
## adjusted Rand's index (ari): 0.5816
## G index (geometric average of eca, ecp, and ari): 0.795
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 14

VST + clustExp of sc_10x_5cl

## cluster accuracy (eca): 0.9722
## cluster purity (ecp): 0.8234
## adjusted Rand's index (ari): 0.4267
## G index (geometric average of eca, ecp, and ari): 0.699
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 15

Townes + clustExp of sc_10x_5cl

## cluster accuracy (eca): 0.9737
## cluster purity (ecp): 0.9243
## adjusted Rand's index (ari): 0.7998
## G index (geometric average of eca, ecp, and ari): 0.8962
## no. of clusters in theoretical partition: 5
## no. of clusters in estimated partition: 7

sc_10x

Dendrogram of sc_10x

## cluster accuracy (eca): 0.9986
## cluster purity (ecp): 0.9988
## adjusted Rand's index (ari): 0.9967
## G index (geometric average of eca, ecp, and ari): 0.998
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 3

PCA of sc_10x

t-SNE of sc_10x

umap of sc_10x

counts + RaceID (default + umap) of sc_10x

## cluster accuracy (eca): 0.9983
## cluster purity (ecp): 0.7338
## adjusted Rand's index (ari): 0.2877
## G index (geometric average of eca, ecp, and ari): 0.5951
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 15

AWST + RaceID (default + umap) of sc_10x

## cluster accuracy (eca): 0.9975
## cluster purity (ecp): 0.7828
## adjusted Rand's index (ari): 0.3667
## G index (geometric average of eca, ecp, and ari): 0.6591
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 11

VST + RaceID (default + umap) of sc_10x

## cluster accuracy (eca): 0.9918
## cluster purity (ecp): 0.8339
## adjusted Rand's index (ari): 0.4737
## G index (geometric average of eca, ecp, and ari): 0.7318
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 8

Townes + RaceID (default + umap) of sc_10x

## cluster accuracy (eca): 0.8388
## cluster purity (ecp): 1
## adjusted Rand's index (ari): 0
## G index (geometric average of eca, ecp, and ari): 0
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 1

counts + RCA (pca) of sc_10x

## cluster accuracy (eca): 0.8792
## cluster purity (ecp): 0.8204
## adjusted Rand's index (ari): 0.2626
## G index (geometric average of eca, ecp, and ari): 0.5743
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 6

AWST + RCA (pca) of sc_10x

## cluster accuracy (eca): 0.9035
## cluster purity (ecp): 0.826
## adjusted Rand's index (ari): 0.3623
## G index (geometric average of eca, ecp, and ari): 0.6466
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 6

VST + RCA (pca) of sc_10x

## cluster accuracy (eca): 0.8792
## cluster purity (ecp): 0.8204
## adjusted Rand's index (ari): 0.2626
## G index (geometric average of eca, ecp, and ari): 0.5743
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 6

Townes + RCA (pca) of sc_10x

## cluster accuracy (eca): 0.9326
## cluster purity (ecp): 0.6146
## adjusted Rand's index (ari): 0.1716
## G index (geometric average of eca, ecp, and ari): 0.4616
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 30

counts + Seurat (low resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9961
## cluster purity (ecp): 0.8624
## adjusted Rand's index (ari): 0.5615
## G index (geometric average of eca, ecp, and ari): 0.7842
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 7

AWST + Seurat (low resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9961
## cluster purity (ecp): 0.8616
## adjusted Rand's index (ari): 0.5564
## G index (geometric average of eca, ecp, and ari): 0.7817
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 7

VST + Seurat (low resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9975
## cluster purity (ecp): 0.8341
## adjusted Rand's index (ari): 0.464
## G index (geometric average of eca, ecp, and ari): 0.7281
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 8

Townes + Seurat (low resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9972
## cluster purity (ecp): 0.9064
## adjusted Rand's index (ari): 0.7032
## G index (geometric average of eca, ecp, and ari): 0.8598
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 6

counts + Seurat (high resolution, pca) of sc_10x

## cluster accuracy (eca): 0.997
## cluster purity (ecp): 0.7973
## adjusted Rand's index (ari): 0.3923
## G index (geometric average of eca, ecp, and ari): 0.6781
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 10

AWST + Seurat (high resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9968
## cluster purity (ecp): 0.7695
## adjusted Rand's index (ari): 0.3452
## G index (geometric average of eca, ecp, and ari): 0.6421
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 12

VST + Seurat (high resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9968
## cluster purity (ecp): 0.7784
## adjusted Rand's index (ari): 0.3518
## G index (geometric average of eca, ecp, and ari): 0.6487
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 11

Townes + Seurat (high resolution, pca) of sc_10x

## cluster accuracy (eca): 0.9967
## cluster purity (ecp): 0.7935
## adjusted Rand's index (ari): 0.3829
## G index (geometric average of eca, ecp, and ari): 0.6716
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 10

counts + SC3 of sc_10x

## cluster accuracy (eca): 0.9578
## cluster purity (ecp): 0.8545
## adjusted Rand's index (ari): 0.5415
## G index (geometric average of eca, ecp, and ari): 0.7624
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 8

AWST + SC3 of sc_10x

## cluster accuracy (eca): 0.8716
## cluster purity (ecp): 0.856
## adjusted Rand's index (ari): 0.5448
## G index (geometric average of eca, ecp, and ari): 0.7408
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 8

VST + SC3 of sc_10x

## cluster accuracy (eca): 0.8533
## cluster purity (ecp): 0.8901
## adjusted Rand's index (ari): 0.6838
## G index (geometric average of eca, ecp, and ari): 0.8038
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 7

Townes + SC3 of sc_10x

## cluster accuracy (eca): 0.9891
## cluster purity (ecp): 0.5296
## adjusted Rand's index (ari): 0.0943
## G index (geometric average of eca, ecp, and ari): 0.3669
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 50

counts + clustExp of sc_10x

## cluster accuracy (eca): 0.9985
## cluster purity (ecp): 0.7224
## adjusted Rand's index (ari): 0.2874
## G index (geometric average of eca, ecp, and ari): 0.5918
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 17

awst + clustExp of sc_10x

## cluster accuracy (eca): 0.9994
## cluster purity (ecp): 0.8615
## adjusted Rand's index (ari): 0.5808
## G index (geometric average of eca, ecp, and ari): 0.7938
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 8

VST + clustExp of sc_10x

## cluster accuracy (eca): 0.9991
## cluster purity (ecp): 0.9187
## adjusted Rand's index (ari): 0.7194
## G index (geometric average of eca, ecp, and ari): 0.8708
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 5

Townes + clustExp of sc_10x

## cluster accuracy (eca): 0.9965
## cluster purity (ecp): 0.9393
## adjusted Rand's index (ari): 0.8016
## G index (geometric average of eca, ecp, and ari): 0.9087
## no. of clusters in theoretical partition: 3
## no. of clusters in estimated partition: 5

RNAmix_celseq2

Dendrogram of RNAmix_celseq2

## cluster accuracy (eca): 0.7685
## cluster purity (ecp): 0.8688
## adjusted Rand's index (ari): 0.337
## G index (geometric average of eca, ecp, and ari): 0.6082
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 4

PCA of RNAmix_celseq2

t-SNE of RNAmix_celseq2

UMAP of RNAmix_celseq2

counts + RaceID (default + umap) of RNAmix_celseq2

## cluster accuracy (eca): 0.8033
## cluster purity (ecp): 0.8719
## adjusted Rand's index (ari): 0.4567
## G index (geometric average of eca, ecp, and ari): 0.6839
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 5

AWST + RaceID (default + umap) of RNAmix_celseq2

## cluster accuracy (eca): 0.8291
## cluster purity (ecp): 0.9185
## adjusted Rand's index (ari): 0.5029
## G index (geometric average of eca, ecp, and ari): 0.7262
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 4

VST + RaceID (default + umap) of RNAmix_celseq2

## cluster accuracy (eca): 0.8425
## cluster purity (ecp): 0.6821
## adjusted Rand's index (ari): 0.3659
## G index (geometric average of eca, ecp, and ari): 0.5947
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 15

Townes + RaceID (default + umap) of RNAmix_celseq2

## cluster accuracy (eca): 0.6547
## cluster purity (ecp): 0.8763
## adjusted Rand's index (ari): 0.051
## G index (geometric average of eca, ecp, and ari): 0.3081
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 2

counts + RCA (pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.7208
## cluster purity (ecp): 0.8634
## adjusted Rand's index (ari): 0.5072
## G index (geometric average of eca, ecp, and ari): 0.6809
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 7

AWST + RCA (pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.8067
## cluster purity (ecp): 0.8795
## adjusted Rand's index (ari): 0.5029
## G index (geometric average of eca, ecp, and ari): 0.7093
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 5

VST + RCA (pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.7208
## cluster purity (ecp): 0.8634
## adjusted Rand's index (ari): 0.5072
## G index (geometric average of eca, ecp, and ari): 0.6809
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 7

Townes + RCA (pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.7312
## cluster purity (ecp): 0.7305
## adjusted Rand's index (ari): 0.2253
## G index (geometric average of eca, ecp, and ari): 0.4937
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 11

counts + Seurat (low resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.9749
## cluster purity (ecp): 0.9546
## adjusted Rand's index (ari): 0.86
## G index (geometric average of eca, ecp, and ari): 0.9284
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 8

awst + Seurat (low resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.9514
## cluster purity (ecp): 0.9172
## adjusted Rand's index (ari): 0.7341
## G index (geometric average of eca, ecp, and ari): 0.862
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 8

VST + Seurat (low resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.9631
## cluster purity (ecp): 0.9318
## adjusted Rand's index (ari): 0.7623
## G index (geometric average of eca, ecp, and ari): 0.8812
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 8

Townes + Seurat (low resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.8659
## cluster purity (ecp): 0.9802
## adjusted Rand's index (ari): 0.678
## G index (geometric average of eca, ecp, and ari): 0.8318
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 4

counts + Seurat (high resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.9732
## cluster purity (ecp): 0.9348
## adjusted Rand's index (ari): 0.7859
## G index (geometric average of eca, ecp, and ari): 0.8942
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 9

awst + Seurat (high resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.9576
## cluster purity (ecp): 0.9056
## adjusted Rand's index (ari): 0.6885
## G index (geometric average of eca, ecp, and ari): 0.8421
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 9

VST + Seurat (high resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.9702
## cluster purity (ecp): 0.8676
## adjusted Rand's index (ari): 0.6395
## G index (geometric average of eca, ecp, and ari): 0.8135
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 11

Townes + Seurat (high resolution, pca) of RNAmix_celseq2

## cluster accuracy (eca): 0.8728
## cluster purity (ecp): 0.84
## adjusted Rand's index (ari): 0.5191
## G index (geometric average of eca, ecp, and ari): 0.7247
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 8

counts + SC3 of RNAmix_celseq2

## cluster accuracy (eca): 0.6101
## cluster purity (ecp): 0.8526
## adjusted Rand's index (ari): 0.1995
## G index (geometric average of eca, ecp, and ari): 0.4699
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 4

awst + SC3 of RNAmix_celseq2

## cluster accuracy (eca): 0.4983
## cluster purity (ecp): 0.948
## adjusted Rand's index (ari): 0.1819
## G index (geometric average of eca, ecp, and ari): 0.4413
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 4

VST + SC3 of RNAmix_celseq2

## cluster accuracy (eca): 0.3641
## cluster purity (ecp): 0.9804
## adjusted Rand's index (ari): 0.002
## G index (geometric average of eca, ecp, and ari): 0.0892
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 3

Townes + SC3 of RNAmix_celseq2

## cluster accuracy (eca): 0.8285
## cluster purity (ecp): 0.8513
## adjusted Rand's index (ari): 0.5455
## G index (geometric average of eca, ecp, and ari): 0.7273
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 7

counts + clustExp of RNAmix_celseq2

## cluster accuracy (eca): 0.7376
## cluster purity (ecp): 0.5934
## adjusted Rand's index (ari): 0.1606
## G index (geometric average of eca, ecp, and ari): 0.4127
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 12

awst + clustExp of RNAmix_celseq2

## cluster accuracy (eca): 0.809
## cluster purity (ecp): 0.9877
## adjusted Rand's index (ari): 0.4364
## G index (geometric average of eca, ecp, and ari): 0.7039
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 3

VST + clustExp of RNAmix_celseq2

## cluster accuracy (eca): 0.7293
## cluster purity (ecp): 0.9037
## adjusted Rand's index (ari): 0.2309
## G index (geometric average of eca, ecp, and ari): 0.5339
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 3

Townes + clustExp of RNAmix_celseq2

## cluster accuracy (eca): 0.7898
## cluster purity (ecp): 0.9429
## adjusted Rand's index (ari): 0.2907
## G index (geometric average of eca, ecp, and ari): 0.6005
## no. of clusters in theoretical partition: 7
## no. of clusters in estimated partition: 3

Supplementary Table 3

where what eca ecp ari G noOfClust_Th noOfClust_Est
sc_10x_5cl AWST protocol 0.9861 0.9806 0.9632 0.9766 5 5
sc_10x_5cl RaceID + counts 0.9767 0.8551 0.5137 0.7542 5 12
sc_10x_5cl RaceID + AWST 0.9853 0.8087 0.4089 0.6881 5 17
sc_10x_5cl RaceID + VST 0.9610 0.9081 0.6685 0.8356 5 8
sc_10x_5cl RaceID + Townes 0.8172 0.9366 0.1493 0.4852 5 2
sc_10x_5cl RCA + counts 0.9161 0.8042 0.4973 0.7155 5 19
sc_10x_5cl RCA + AWST 0.9044 0.7996 0.4613 0.6936 5 20
sc_10x_5cl RCA + VST 0.9161 0.8042 0.4973 0.7155 5 19
sc_10x_5cl RCA + Townes 0.9280 0.5664 0.1112 0.3881 5 109
sc_10x_5cl Seurat (LoRes) + counts 0.9862 0.8837 0.6000 0.8056 5 12
sc_10x_5cl Seurat (LoRes) + AWST 0.9845 0.8366 0.4787 0.7333 5 15
sc_10x_5cl Seurat (LoRes) + VST 0.9869 0.8854 0.5757 0.7953 5 12
sc_10x_5cl Seurat (LoRes) + Townes 0.9851 0.8765 0.6074 0.8064 5 12
sc_10x_5cl Seurat (HiRes) + counts 0.9837 0.7908 0.3455 0.6453 5 19
sc_10x_5cl Seurat (HiRes) + AWST 0.9848 0.7871 0.3394 0.6408 5 20
sc_10x_5cl Seurat (HiRes) + VST 0.9846 0.7921 0.3488 0.6480 5 19
sc_10x_5cl Seurat (HiRes) + Townes 0.9833 0.8208 0.4275 0.7014 5 16
sc_10x_5cl SC3 + counts 0.9811 0.8702 0.6620 0.8268 5 16
sc_10x_5cl SC3 + AWST 0.9861 0.8265 0.4292 0.7046 5 20
sc_10x_5cl SC3 + VST 0.9843 0.8792 0.6743 0.8356 5 15
sc_10x_5cl SC3 + Townes 0.9685 0.6270 0.1299 0.4289 5 76
sc_10x_5cl clustExp + counts 0.9809 0.7198 0.2680 0.5741 5 33
sc_10x_5cl clustExp + AWST 0.9881 0.8742 0.5816 0.7950 5 14
sc_10x_5cl clustExp + VST 0.9722 0.8234 0.4267 0.6990 5 15
sc_10x_5cl clustExp + Townes 0.9737 0.9243 0.7998 0.8962 5 7
sc_10x AWST protocol 0.9986 0.9988 0.9967 0.9980 3 3
sc_10x RaceID + counts 0.9983 0.7338 0.2877 0.5951 3 15
sc_10x RaceID + AWST 0.9975 0.7828 0.3667 0.6591 3 11
sc_10x RaceID + VST 0.9918 0.8339 0.4737 0.7318 3 8
sc_10x RaceID + Townes 0.8388 1.0000 0.0000 0.0000 3 1
sc_10x RCA + counts 0.8792 0.8204 0.2626 0.5743 3 6
sc_10x RCA + AWST 0.9035 0.8260 0.3623 0.6466 3 6
sc_10x RCA + VST 0.8792 0.8204 0.2626 0.5743 3 6
sc_10x RCA + Townes 0.9326 0.6146 0.1716 0.4616 3 30
sc_10x Seurat (LoRes) + counts 0.9961 0.8624 0.5615 0.7842 3 7
sc_10x Seurat (LoRes) + AWST 0.9961 0.8616 0.5564 0.7817 3 7
sc_10x Seurat (LoRes) + VST 0.9975 0.8341 0.4640 0.7281 3 8
sc_10x Seurat (LoRes) + Townes 0.9972 0.9064 0.7032 0.8598 3 6
sc_10x Seurat (HiRes) + counts 0.9970 0.7973 0.3923 0.6781 3 10
sc_10x Seurat (HiRes) + AWST 0.9968 0.7695 0.3452 0.6421 3 12
sc_10x Seurat (HiRes) + VST 0.9968 0.7784 0.3518 0.6487 3 11
sc_10x Seurat (HiRes) + Townes 0.9967 0.7935 0.3829 0.6716 3 10
sc_10x SC3 + counts 0.9578 0.8545 0.5415 0.7624 3 8
sc_10x SC3 + AWST 0.8716 0.8560 0.5448 0.7408 3 8
sc_10x SC3 + VST 0.8533 0.8901 0.6838 0.8038 3 7
sc_10x SC3 + Townes 0.9891 0.5296 0.0943 0.3669 3 50
sc_10x clustExp + counts 0.9985 0.7224 0.2874 0.5918 3 17
sc_10x clustExp + AWST 0.9994 0.8615 0.5808 0.7938 3 8
sc_10x clustExp + VST 0.9991 0.9187 0.7194 0.8708 3 5
sc_10x clustExp + Townes 0.9965 0.9393 0.8016 0.9087 3 5
RNAmix_celseq2 AWST protocol 0.7685 0.8688 0.3370 0.6082 7 4
RNAmix_celseq2 RaceID + counts 0.8033 0.8719 0.4567 0.6839 7 5
RNAmix_celseq2 RaceID + AWST 0.8291 0.9185 0.5029 0.7262 7 4
RNAmix_celseq2 RaceID + VST 0.8425 0.6821 0.3659 0.5947 7 15
RNAmix_celseq2 RaceID + Townes 0.6547 0.8763 0.0510 0.3081 7 2
RNAmix_celseq2 RCA + counts 0.7208 0.8634 0.5072 0.6809 7 7
RNAmix_celseq2 RCA + AWST 0.8067 0.8795 0.5029 0.7093 7 5
RNAmix_celseq2 RCA + VST 0.7208 0.8634 0.5072 0.6809 7 7
RNAmix_celseq2 RCA + Townes 0.7312 0.7305 0.2253 0.4937 7 11
RNAmix_celseq2 Seurat (LoRes) + counts 0.9749 0.9546 0.8600 0.9284 7 8
RNAmix_celseq2 Seurat (LoRes) + AWST 0.9514 0.9172 0.7341 0.8620 7 8
RNAmix_celseq2 Seurat (LoRes) + VST 0.9631 0.9318 0.7623 0.8812 7 8
RNAmix_celseq2 Seurat (LoRes) + Townes 0.8659 0.9802 0.6780 0.8318 7 4
RNAmix_celseq2 Seurat (HiRes) + counts 0.9732 0.9348 0.7859 0.8942 7 9
RNAmix_celseq2 Seurat (HiRes) + AWST 0.9576 0.9056 0.6885 0.8421 7 9
RNAmix_celseq2 Seurat (HiRes) + VST 0.9702 0.8676 0.6395 0.8135 7 11
RNAmix_celseq2 Seurat (HiRes) + Townes 0.8728 0.8400 0.5191 0.7247 7 8
RNAmix_celseq2 SC3 + counts 0.6101 0.8526 0.1995 0.4699 7 4
RNAmix_celseq2 SC3 + AWST 0.4983 0.9480 0.1819 0.4413 7 4
RNAmix_celseq2 SC3 + VST 0.3641 0.9804 0.0020 0.0892 7 3
RNAmix_celseq2 SC3 + Townes 0.8285 0.8513 0.5455 0.7273 7 7
RNAmix_celseq2 clustExp + counts 0.7376 0.5934 0.1606 0.4127 7 12
RNAmix_celseq2 clustExp + AWST 0.8090 0.9877 0.4364 0.7039 7 3
RNAmix_celseq2 clustExp + VST 0.7293 0.9037 0.2309 0.5339 7 3
RNAmix_celseq2 clustExp + Townes 0.7898 0.9429 0.2907 0.6005 7 3
# Supplementary Table 3: reload the saved benchmark summary from disk and
# wrap it in an xtable object.
# Note: rm(list = ls()) removes every object in the current workspace, so
# anything computed earlier in the session is gone after this line.
rm(list = ls())
setwd("~/Dropbox/AWST/mixology/")
jobName <- "mixoloy20200925"
# The results file is tab-separated and its first line is read as the header.
ttable <- read.delim("mixoloy20200925_results.tsv")
library(xtable)
x.table <- xtable(ttable)
#print(xtable(ttable), include.rownames = FALSE)

Figure 4

# Figure 4: boxplots of the G index grouped by pre-processing method
# (AWST, VST, counts, Townes), with the "AWST protocol" results overlaid as
# points next to the first box.
#
# Alternative standalone setup (reload the saved results from disk instead of
# using the in-memory `results` object):
#rm(list = ls())
#setwd("~/Dropbox/AWST/mixology/")
#ttable <- read.csv("mixoloy20200925_results.tsv", sep = "\t")
#jobName <- "mixoloy20200925"
save_png <- FALSE

# `$` column access and two-index row subsetting below require a data frame,
# and boxplot()/points() require a numeric G. Coerce defensively so the chunk
# works whether `results` is a (character) matrix or already a data frame.
# as.character() first so that a factor column is converted by label, not code.
ttable <- as.data.frame(results, stringsAsFactors = FALSE)
ttable$G <- as.numeric(as.character(ttable$G))

ttable$method <- NA_character_

# Split off the "AWST protocol" rows; they are drawn as points, not boxed.
# Logical masks are used instead of negative grep() indices: x[-integer(0), ]
# would silently drop every row when the pattern matches nothing.
is_protocol <- grepl("protocol", ttable$what)
awst_protocol <- ttable[is_protocol, ]
ttable <- ttable[!is_protocol, ]

# The low-resolution Seurat runs are excluded from the figure.
ttable <- ttable[!grepl("LoRes", ttable$what), ]

# Label each remaining row by the pre-processing method named in `what`.
ttable$method[grepl("AWST", ttable$what)] <- "AWST"
ttable$method[grepl("VST", ttable$what)] <- "VST"
ttable$method[grepl("counts", ttable$what)] <- "counts"
ttable$method[grepl("Townes", ttable$what)] <- "Townes"

if (save_png) {
  # NOTE(review): png() documents `res` as the nominal resolution in ppi;
  # 1/300 looks unintended (300 or the default?) -- confirm before enabling.
  png(file = paste0(jobName, "_final_performance.png"),
      width = 600, height = 350, res = 1/300)
}
boxplot(G ~ method, data = ttable, ylim = c(0, 1),
        xlab = "pre-processing methods",
        ylab = "performance (G index)")#, border = "gray50")
# One point per protocol row, slightly right of the first box.
points(rep(1.13, nrow(awst_protocol)), awst_protocol$G, pch = 20)

#text(rep(1.13, 3), awst_protocol$G + c(0.015, -0.055, -0.02), awst_protocol$where, pos = 4)
#text(rep(1.13, 3), awst_protocol$G + c(0.015, -0.055, -0.02), awst_protocol$what, pos = 2)
# Close the PNG device only if one was opened above, so the file is written.
if (save_png) dev.off()

Session info

# Print the R version, platform, locale, and attached/loaded packages for this run.
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=it_IT.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=it_IT.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=it_IT.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] clues_0.6.2.2           dendextend_1.13.4       steFunctions_2019.04.29
## [4] knitr_1.28             
## 
## loaded via a namespace (and not attached):
##  [1] umap_0.2.5.0      Rcpp_1.0.3        RSpectra_0.16-0   highr_0.8        
##  [5] compiler_3.6.3    pillar_1.4.3      viridis_0.5.1     iterators_1.0.12 
##  [9] tools_3.6.3       digest_0.6.25     jsonlite_1.6.1    lattice_0.20-41  
## [13] evaluate_0.14     lifecycle_0.2.0   tibble_2.1.3      gtable_0.3.0     
## [17] viridisLite_0.3.0 pkgconfig_2.0.3   rlang_0.4.5       Matrix_1.2-18    
## [21] foreach_1.4.8     rstudioapi_0.11   yaml_2.2.1        parallel_3.6.3   
## [25] xfun_0.12         gridExtra_2.3     stringr_1.4.0     dplyr_0.8.5      
## [29] askpass_1.1       grid_3.6.3        tidyselect_1.0.0  reticulate_1.14  
## [33] glue_1.3.2        R6_2.4.1          rmarkdown_2.1     ggplot2_3.3.0    
## [37] purrr_0.3.3       magrittr_1.5      scales_1.1.0      codetools_0.2-16 
## [41] htmltools_0.4.0   assertthat_0.2.1  colorspace_1.4-1  stringi_1.4.6    
## [45] openssl_1.4.1     doParallel_1.0.15 munsell_0.5.0     crayon_1.3.4